library(VIM)
library(naniar)
library(panelView)
library(ggplot2)
1. Problematyka braków danych
1 Wczytanie potrzebnych pakietów
import pandas as pd
import numpy as np
import missingno as msno
import matplotlib.pyplot as plt
import seaborn as sns
from upsetty import Upset
2 Przykład 1: dane przekrojowe
Wczytujemy zbiór danych data2-cross_sectional.csv
df_cross <- read.csv("../data/data2-cross_sectional.csv")
head(df_cross)
y x1 x2 x3
1 -2.59353983 -2.309169 NA NA
2 -2.57878220 -1.966617 NA NA
3 -0.34323612 -1.686693 NA NA
4 0.02211028 -1.548753 NA NA
5 0.86592554 -1.265396 NA NA
6 -1.03985622 -1.265061 NA NA
Proste podsumowanie
summary(df_cross)
y x1 x2 x3
Min. :-2.593540 Min. :-2.30917 Min. :-1.6179 Min. :-1.75653
1st Qu.:-0.726888 1st Qu.:-0.51570 1st Qu.:-0.7684 1st Qu.:-0.53141
Median : 0.080384 Median :-0.04523 Median :-0.2063 Median : 0.03407
Mean : 0.000809 Mean : 0.06537 Mean :-0.1191 Mean :-0.09317
3rd Qu.: 0.839176 3rd Qu.: 0.72101 3rd Qu.: 0.4566 3rd Qu.: 0.49989
Max. : 2.275646 Max. : 2.18733 Max. : 2.1001 Max. : 1.59851
NA's :20 NA's :24 NA's :72
Wizualizacja z pakietem VIM
# Run VIM's aggr() on df_cross and keep the result, then print its summary:
# the missing-value count per variable and the count/percentage of each
# combination of missing variables.
vim_result <- aggr(x = df_cross)
summary(vim_result)
Missings per variable:
Variable Count
y 0
x1 20
x2 24
x3 72
Missings in combinations of variables:
Combinations Count Percent
0:0:0:0 20 20
0:0:0:1 39 39
0:0:1:0 2 2
0:0:1:1 19 19
0:1:0:0 6 6
0:1:0:1 11 11
0:1:1:1 3 3
Wizualizacja z pakietem naniar.
# Cell-level missingness map of df_cross: first with default ordering, then
# with row clustering and column sorting by missingness switched on.
vis_miss(df_cross)
vis_miss(df_cross, cluster = TRUE, sort_miss = TRUE)

# Missingness per variable, and an UpSet plot of missingness combinations.
gg_miss_var(df_cross)
gg_miss_upset(df_cross)

# A plain scatter plot drops the rows with missing values, as the rendered
# warning below shows.
ggplot(data = df_cross, aes(x = x1, y)) + geom_point()
#> Warning: Removed 20 rows containing missing values or values outside the scale range
#> (`geom_point()`).

# Same plot with naniar's geom_miss_point() in place of geom_point().
ggplot(data = df_cross, aes(x = x1, y)) + geom_miss_point()

# Python (pandas): load the same cross-sectional file for the Python examples.
df_cross = pd.read_csv("../data/data2-cross_sectional.csv")
df_cross.head()
y x1 x2 x3
0 -2.593540 -2.309169 NaN NaN
1 -2.578782 -1.966617 NaN NaN
2 -0.343236 -1.686693 NaN NaN
3 0.022110 -1.548753 NaN NaN
4 0.865926 -1.265396 NaN NaN
miss_plot = msno.matrix(df_cross)
plt.show()
plt.ioff()
<contextlib.ExitStack object at 0x16f218a90>
miss_dendro = msno.dendrogram(df_cross)
plt.show()
upset = Upset.generate_plot(df_cross.isnull())
upset.show()
3 Przykład 2: dane panelowe
Wczytujemy dane w dwóch formatach: data/data2-panel_long.csv, data/data2-panel_wide.csv
df_long <- read.csv("../data/data2-panel_long.csv")
head(df_long)
unit_id year y x1 x2
1 1 2015 -3.530126 -2.3622119 -2.6785088
2 1 2016 -3.204725 -2.8099225 -1.0101543
3 1 2017 -3.034548 -2.4910414 -1.8172416
4 1 2018 -2.720240 -1.4998663 -1.9648143
5 1 2019 -2.182205 -1.4482605 -1.6537966
6 1 2020 -1.840414 -0.6895582 -0.9912898
df_wide <- read.csv("../data/data2-panel_wide.csv")
head(df_wide)
unit_id y.2015 x1.2015 x2.2015 y.2016 x1.2016 x2.2016
1 1 -3.530126 -2.362211936 -2.6785088 -3.2047254 -2.8099225 -1.0101543
2 2 0.568902 -0.424010177 0.9818217 0.6039925 NA NA
3 3 1.427478 0.005015081 1.2572458 0.8887155 NA NA
4 4 -3.477064 -2.502756453 -1.9826823 -2.6826293 -2.1249015 -1.7178874
5 5 -2.007676 NA NA -1.1877712 -1.8131336 -1.3596098
6 6 -2.059100 -1.190443674 -1.9837228 -1.0404844 -0.9521887 -0.6128572
y.2017 x1.2017 x2.2017 y.2018 x1.2018 x2.2018 y.2019
1 -3.0345484 -2.4910414 -1.81724159 -2.7202403 -1.4998663 -1.9648143 -2.1822049
2 0.6704031 -0.8117871 0.80569343 1.3155977 0.5093892 0.5064433 1.2432841
3 0.8473622 0.2419629 0.01145462 1.0985260 0.5821369 1.0316695 1.6015851
4 -2.5445519 NA NA -2.8655684 -1.2009330 -2.1596327 -2.3762645
5 -1.2179854 NA NA -1.2431829 -0.5268546 -0.0286618 -1.3686251
6 -1.9270956 -1.6184360 -1.53674341 -0.2946865 0.1375292 0.3467936 -0.4073164
x1.2019 x2.2019 y.2020 x1.2020 x2.2020 y.2021
1 -1.4482605 -1.6537966 -1.84041446 -0.6895582 -0.99128978 -2.25887006
2 0.6202672 0.4817378 1.76210283 1.6164227 1.00273225 1.01640057
3 1.7534910 0.3221606 1.59996491 0.9481215 -0.06525444 NA
4 -1.1665352 -1.4703061 -2.04844719 -0.9363446 -1.51867585 -2.23447897
5 -0.8229503 -0.3181881 -0.86471969 0.1961181 -0.71225250 -0.53040779
6 -0.4208893 -0.1267792 0.04081512 -0.1021241 -0.32718542 -0.04982249
x1.2021 x2.2021 y.2022 x1.2022 x2.2022 y.2023
1 -1.3144860 -0.9215337 -2.03611946 -1.0736086 -1.82785228 -0.952456617
2 0.5519369 0.3990350 1.56796215 0.9857544 1.19197201 1.879899271
3 NA NA NA NA NA NA
4 -1.3350168 -1.8754815 -1.69493361 -0.1758811 -1.23627143 -1.560217747
5 0.3277557 -1.0525771 -0.06009162 -0.9959412 0.39601553 0.004972416
6 -0.1200651 0.2559760 -0.59869442 -0.3174002 -0.08000885 -0.653598980
x1.2023 x2.2023 y.2024 x1.2024 x2.2024
1 0.006495037 -0.3005491 -1.9426917 -1.1068217 -0.5397062
2 NA NA 1.6401849 1.3837963 0.5168941
3 NA NA NA NA NA
4 NA NA -1.4017691 -0.3189286 -0.7431038
5 -0.378907252 -0.2225839 -0.8976668 -0.3185950 0.1648471
6 -0.233184890 -0.5287449 0.9094604 0.5683409 0.1542975
# Aggregated missingness of the wide panel (columns are variable.year pairs).
VIM::aggr(df_wide)

# panelView missingness plots built from the long panel, indexed by unit_id
# and year; one call per formula (y only, x2 only, then y, x1 and x2).
panelview(
  data = df_long, formula = y ~ 1,
  index = c("unit_id", "year"), type = "missing"
)
panelview(
  data = df_long, formula = x2 ~ 1,
  index = c("unit_id", "year"), type = "missing"
)
panelview(
  data = df_long, formula = 1 ~ y + x1 + x2,
  index = c("unit_id", "year"), type = "missing"
)

# Python (pandas): load the long-format panel for the Python examples.
df_long = pd.read_csv("../data/data2-panel_long.csv")
df_long.head()
unit_id year y x1 x2
0 1 2015 -3.530126 -2.362212 -2.678509
1 1 2016 -3.204725 -2.809923 -1.010154
2 1 2017 -3.034548 -2.491041 -1.817242
3 1 2018 -2.720240 -1.499866 -1.964814
4 1 2019 -2.182205 -1.448260 -1.653797
df_wide = pd.read_csv("../data/data2-panel_wide.csv")
df_wide.head()
unit_id y.2015 x1.2015 ... y.2024 x1.2024 x2.2024
0 1 -3.530126 -2.362212 ... -1.942692 -1.106822 -0.539706
1 2 0.568902 -0.424010 ... 1.640185 1.383796 0.516894
2 3 1.427478 0.005015 ... NaN NaN NaN
3 4 -3.477064 -2.502756 ... -1.401769 -0.318929 -0.743104
4 5 -2.007676 NaN ... -0.897667 -0.318595 0.164847
[5 rows x 31 columns]
# Reshape the long panel into a unit_id x year grid of y, then flag the
# cells where y is missing (True = missing).
missing_pattern = (
    df_long
    .pivot(index='unit_id', columns='year', values='y')
    .isnull()
)

# Draw the boolean grid as a heatmap with the 'binary' colormap on a
# 12x8 inch figure, with a colorbar and a title.
plt.figure(figsize=(12, 8))
sns.heatmap(data=missing_pattern, cmap='binary', cbar=True)
plt.title('Panel View of Missing Data')
plt.show()